Logistic Regression Model

Author

Hope Donoghue

Dataset: shots_valid_wwc_log

library(readr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(ggsoccer)
library(broom)
library(stringr)
library(SBpitch)
library(ggplot2)
library(caret)
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift
pitch = create_Pitch("#ffffff", "#A9A9A9", "#ffffff", "#000000", BasicFeatures = FALSE, goaltype = "box")
#write.csv(shots_valid_wwc2023_new, file = "shots_valid_wwc2023_new.csv")
shots_valid_wwc2023_new <- read_csv("shots_valid_wwc2023_new.csv")
New names:
Rows: 1540 Columns: 90
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(18): id, possession_team.name.x, team.name.x, player.name.GK.x, oppone... dbl
(60): ...1, match_id.x, index, period, minute, second, possession, dura... lgl
(11): under_pressure, off_camera, shot.first_time, shot.aerial_won, sho... time
(1): timestamp
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
shots_valid_wwc_log <- read_csv("shots_valid_wwc_log.csv")
Rows: 1540 Columns: 99
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (21): id, possession_team.name.x, team.name.x, player.name.GK.x, oppone...
dbl  (66): match_id.x, index, period, minute, second, possession, duration, ...
lgl  (11): under_pressure, off_camera, shot.first_time, shot.aerial_won, sho...
time  (1): timestamp

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Logistic regression part 2
# shot.type.name probably confounded so drop it from model entirely 
# indicator variable

library(broom)

ind.vars2 = c("id", 'goal', 'distance.to.gk', 'DistToGoal', 'angle.to.gk', 'angle.to.goal', 'play_pattern.name', 'shot.technique.name', 'shot.body_part.name')

shots.varsdata_log = subset(shots_valid_wwc2023_new, select = ind.vars2)  %>%
  drop_na()


#splitting into test and test with 80% split
idx_log = createDataPartition(shots.varsdata_log$goal, p = 0.8, list = F)
train_log = shots.varsdata_log[idx_log,]
test_log = shots.varsdata_log[-idx_log,]


goal_glm <- glm(goal ~ DistToGoal + angle.to.goal + distance.to.gk + angle.to.gk + play_pattern.name + shot.technique.name + shot.body_part.name, data = train_log, family = "binomial")

goal_glm |> tidy() |>
  print(n=Inf)
# A tibble: 18 × 5
   term                            estimate std.error statistic  p.value
   <chr>                              <dbl>     <dbl>     <dbl>    <dbl>
 1 (Intercept)                     -1.81      0.522     -3.46   0.000532
 2 DistToGoal                      -0.114     0.0550    -2.08   0.0376  
 3 angle.to.goal                   -0.00420   0.00395   -1.06   0.287   
 4 distance.to.gk                  -0.0505    0.0520    -0.970  0.332   
 5 angle.to.gk                      0.00561   0.00317    1.77   0.0766  
 6 play_pattern.nameFrom Counter    0.768     0.533      1.44   0.150   
 7 play_pattern.nameFrom Free Kick -0.657     0.537     -1.22   0.221   
 8 play_pattern.nameFrom Goal Kick  0.448     0.625      0.717  0.473   
 9 play_pattern.nameFrom Keeper     1.41      0.751      1.88   0.0596  
10 play_pattern.nameFrom Throw In   0.253     0.363      0.698  0.485   
11 play_pattern.nameOther           0.0439    0.895      0.0491 0.961   
12 play_pattern.nameRegular Play    0.552     0.345      1.60   0.110   
13 shot.technique.nameLob           2.60      1.43       1.81   0.0702  
14 shot.technique.nameNormal        0.735     0.354      2.07   0.0381  
15 shot.technique.nameVolley       -0.291     0.562     -0.517  0.605   
16 shot.body_part.nameLeft Foot     0.714     0.391      1.83   0.0680  
17 shot.body_part.nameOther         1.64      0.658      2.50   0.0125  
18 shot.body_part.nameRight Foot    0.939     0.358      2.63   0.00863 
library(modelr)

Attaching package: 'modelr'
The following object is masked from 'package:broom':

    bootstrap
augment <-augment(goal_glm, newdata = test_log,
                    se_fit = TRUE)

augment
# A tibble: 308 × 11
   id                   goal distance.to.gk DistToGoal angle.to.gk angle.to.goal
   <chr>               <dbl>          <dbl>      <dbl>       <dbl>         <dbl>
 1 28207cdc-c5f8-49b3…     0           8.00       8.93        47.6          48.7
 2 29829db0-fe0a-4298…     0           8.18      12.3         26.9          25.1
 3 50bab4fc-ed55-4409…     0          11.9       12.3         95.3          95.6
 4 0adcc403-17cf-4e87…     0          27.8       28.8         99.5         101. 
 5 f6964d5d-e478-4174…     0           8.16       9.31        70.7          65.9
 6 a7d3f03e-ae8e-4eb1…     1          16.1       20.2         42.5          39.8
 7 e4ffb2b6-6f5f-4c09…     0          14.5       16.8        126.          130. 
 8 ebfd65eb-42f0-4ed7…     0           8.77      10.8         45.0          53.7
 9 fe0c0079-ef00-4473…     0          13.1       17.3         27.2          25.4
10 3f550539-9342-4e7b…     0          21.5       24.6         53.9          51.6
# ℹ 298 more rows
# ℹ 5 more variables: play_pattern.name <chr>, shot.technique.name <chr>,
#   shot.body_part.name <chr>, .fitted <dbl>, .se.fit <dbl>
# Convert .fitted values to predicted probabilities

aug_prob <- augment %>%
  mutate(.fitted = round(exp(.fitted)/(1+exp(.fitted)), 4))

aug_prob
# A tibble: 308 × 11
   id                   goal distance.to.gk DistToGoal angle.to.gk angle.to.goal
   <chr>               <dbl>          <dbl>      <dbl>       <dbl>         <dbl>
 1 28207cdc-c5f8-49b3…     0           8.00       8.93        47.6          48.7
 2 29829db0-fe0a-4298…     0           8.18      12.3         26.9          25.1
 3 50bab4fc-ed55-4409…     0          11.9       12.3         95.3          95.6
 4 0adcc403-17cf-4e87…     0          27.8       28.8         99.5         101. 
 5 f6964d5d-e478-4174…     0           8.16       9.31        70.7          65.9
 6 a7d3f03e-ae8e-4eb1…     1          16.1       20.2         42.5          39.8
 7 e4ffb2b6-6f5f-4c09…     0          14.5       16.8        126.          130. 
 8 ebfd65eb-42f0-4ed7…     0           8.77      10.8         45.0          53.7
 9 fe0c0079-ef00-4473…     0          13.1       17.3         27.2          25.4
10 3f550539-9342-4e7b…     0          21.5       24.6         53.9          51.6
# ℹ 298 more rows
# ℹ 5 more variables: play_pattern.name <chr>, shot.technique.name <chr>,
#   shot.body_part.name <chr>, .fitted <dbl>, .se.fit <dbl>
shots.varsdata_log <- augment(goal_glm, newdata = shots.varsdata_log, se_fit = TRUE) %>%
  mutate(.fitted = round(exp(.fitted)/(1+exp(.fitted)), 4))
# new dataset to USE!!!
shots_valid_wwc_log <- left_join(shots_valid_wwc2023_new, shots.varsdata_log, by = "id") 
# eliminate unnecessary variables with matrix and list notation
#shots_valid_wwc_log_new <- shots_valid_wwc_log %>%
  #select(-c(related_events, location, shot.end_location, shot.freeze_frame, tactics.lineup, pass.end_location, carry.end_location, goalkeeper.end_location, out, my.xG))
# create csv file so don't have to keep going back and forth 
#write_csv(shots_valid_wwc_log_new, file = "shots_valid_wwc_log.csv")
player_dataset_log <- shots_valid_wwc_log %>%
    rename("Play_Type" = play_pattern.name.y) %>%
    rename("Opponent" = opponent) %>%
    rename("Predicted_xG" = .fitted) %>%
    rename("Shot_body_part" = shot.body_part.name.y) %>%
    rename("Shot_type" = shot.technique.name.x)
# MSE between logistic xg predictions and statsbomb predictions

(sum((shots_valid_wwc_log$shot.statsbomb_xg - shots_valid_wwc_log$.fitted)^2)/1540)
[1] 0.01012122

MSE is 0.009

Table:

player_table_shots <- player_dataset_log %>%
  group_by(player.name) %>%
  tally(name = "total_shots", sort = TRUE)

player_table_goals <- player_dataset_log %>%
  filter(is.goal == "Goal") %>%
  group_by(player.name) %>%
  tally(name = "goals", sort = TRUE)

player_table_xg <- player_dataset_log %>%
  group_by(player.name) %>%
  tally(Predicted_xG, name = "total_xG", sort = TRUE)

summary_data_table <- left_join(player_table_xg, player_table_shots, by = "player.name") %>%
  mutate(xG_per_shot = sprintf("%0.2f", total_xG/total_shots))

summary_data_table_log <- left_join(summary_data_table, player_table_goals, by = "player.name") %>%
   mutate_all(~replace(., is.na(.), 0))
# pull out levels of players and team
library(tidyverse)
players_unique <- player_dataset_log %>%
  pull(player.name) %>%
  unique()

team_unique <- player_dataset_log %>% 
  pull(team.name.x) %>%
  unique()
# shiny App for logistic regression with shot map for players from certain teams
library(shiny)
library(plotly)
library(ggsoccer)

ui <- fluidPage(sidebarLayout(
  sidebarPanel(
    selectInput("team_sel", "Choose a team:", choices = sort(team_unique)),
    selectInput("player_sel", "Choose a player:", choices = NULL)),
  
  mainPanel(plotlyOutput("shot_map"), tableOutput("table"))
  )
  
)

server <- function(input, output, session) {
  
  observeEvent(input$team_sel, {
    
    player_choices <- player_dataset_log %>% 
      filter(team.name.x == input$team_sel) %>% 
      distinct(player.name) %>% pull(player.name)
    
    updateSelectInput(inputId = "player_sel", 
                      choices = sort(player_choices))
  })
  
  player_team <- reactive({
    player_team <- player_dataset_log %>%
      filter(player.name == input$player_sel) %>%
      filter(team.name.x == input$team_sel) 
    
  })
  
  output$shot_map <- renderPlotly({
  plotly_shot_map <- ggplot(data = player_team(), aes(x = location.x, y = location.y, color = Predicted_xG, stroke = is.goal_numeric, label = Play_Type, thirdlabel = Shot_body_part, fourthlabel = Shot_type, secondlabel = Opponent)) +
   annotate_pitch(dimensions = pitch_statsbomb) + theme_pitch() + coord_flip(xlim = c(55, 120), ylim = c(-12, 105)) + geom_point(shape = 1, size = 1.5) + labs(title = glue::glue(" Expected Goal Model: \n Shot Map for ", input$player_sel), color = "Probability of Goal \n (xG)") + scale_colour_gradientn(colors = c(low = "blue", high = "red"), limits=c(0, 1))
    
  ggplotly(plotly_shot_map, tooltip = c("color", "label", "thirdlabel", "fourthlabel", "secondlabel"))
  })
  
  output$table <- renderTable({
    
    player_table_shots <- player_team() %>%
      group_by(player.name) %>%
      tally(name = "total_shots", sort = TRUE)

    player_table_goals <- player_team() %>%
      filter(is.goal == "Goal") %>%
      group_by(player.name) %>%
      tally(name = "goals", sort = TRUE)
    
    player_table_xg <- player_team() %>%
      group_by(player.name) %>%
      tally(Predicted_xG, name = "total_xG", sort = TRUE)
    
    summary_data_table <-
      left_join(player_table_xg, player_table_shots, by = "player.name") %>%
      mutate(xG_per_shot = sprintf("%0.2f", total_xG / total_shots))
    
    summary_data_table <-
      left_join(summary_data_table, player_table_goals, by = "player.name") %>%
      mutate_all(~replace(., is.na(.), 0)) %>%
      select(total_xG, total_shots, xG_per_shot, goals)
    
  })
  
}

shinyApp(ui, server)

Shiny applications not supported in static R Markdown documents

Specific player shot map:

alex_morgan <- player_dataset_log %>%
      filter(player.name == "Alexandra Morgan Carrasco")
hinata_miyazawa <- player_dataset_log %>%
  filter(player.name == "Hinata Miyazawa")
alessia_russo <- player_dataset_log %>%
  filter(player.name == "Alessia Russo")
plotly_shot_map <- ggplot(data = alex_morgan, aes(x = location.x, y = location.y, color = Predicted_xG, stroke = is.goal_numeric, label = Play_Type, secondlabel = Opponent)) +
   annotate_pitch(dimensions = pitch_statsbomb) + theme_pitch() + coord_flip(xlim = c(55, 120), ylim = c(-12, 105)) + geom_point(shape = 1, size = 1.5) + labs(title = glue::glue(" xG Shot Map for Alex Morgan"), color = "xG") + scale_colour_gradientn(colors = c(low = "blue", high = "red"), limits=c(0, 1))

ggplotly(plotly_shot_map, tooltip = c("color", "label", "secondlabel"))
Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues
Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width

Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width
plotly_shot_map <- ggplot(data = hinata_miyazawa, aes(x = location.x, y = location.y, color = Predicted_xG, stroke = is.goal_numeric, label = Play_Type, secondlabel = Opponent)) +
   annotate_pitch(dimensions = pitch_statsbomb) + theme_pitch() + coord_flip(xlim = c(55, 120), ylim = c(-12, 105)) + geom_point(shape = 1, size = 1.5) + labs(title = glue::glue(" xG Shot Map for Hinata Miyazawa"), color = "xG") + scale_colour_gradientn(colors = c(low = "blue", high = "red"), limits=c(0, 1))

ggplotly(plotly_shot_map, tooltip = c("color", "label", "secondlabel"))
Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues
Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width

Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width
plotly_shot_map <- ggplot(data = alessia_russo, aes(x = location.x, y = location.y, color = Predicted_xG, stroke = is.goal_numeric, label = Play_Type, secondlabel = Opponent)) +
   annotate_pitch(dimensions = pitch_statsbomb) + theme_pitch() + coord_flip(xlim = c(55, 120), ylim = c(-12, 105)) + geom_point(shape = 1, size = 1.5) + labs(title = glue::glue(" xG Shot Map for Alessia Russo"), color = "xG") + scale_colour_gradientn(colors = c(low = "blue", high = "red"), limits=c(0, 1))

ggplotly(plotly_shot_map, tooltip = c("color", "label", "secondlabel"))
Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues

Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomCurve() has yet to be implemented in plotly.
  If you'd like to see this geom implemented,
  Please open an issue with your example code at
  https://github.com/ropensci/plotly/issues
Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width

Warning: Aspect ratios aren't yet implemented, but you can manually set a
suitable height/width
# using open point shape to include stroke and color to the shot map
# stroke argument needs numeric variables
# Expected goals model overall for WWC 2023

shots_valid_wwc_log <- shots_valid_wwc_log %>%
  mutate(is.goal_numeric = if_else(is.goal == "Goal", 1.5, 0.5))

a8 = pitch + geom_point(data = shots_valid_wwc_log, aes(x = location.x, y = location.y, color = .fitted, stroke = is.goal_numeric), shape = 1, size = 2) + labs(color = "Probability of Goal \n (xG)", caption = "Removed Republic of Ireland's goal directly from corner; Darker filled in circles mean goal was scored", title = "Expected Goals Model WWC 2023") + scale_colour_gradientn(colors = c(low = "blue", high = "red"), limits=c(0, 1))

a8

Investigating higher probability shots

# higher probability for lob shots why 83% in model only 4 lobs in all of dataset and this went in
# https://www.foxsports.com/watch/play-68cf11a10000102
shots.varsdata_log %>%
  group_by(shot.technique.name, goal) %>%
  summarise(n())
`summarise()` has grouped output by 'shot.technique.name'. You can override
using the `.groups` argument.
# A tibble: 8 × 3
# Groups:   shot.technique.name [4]
  shot.technique.name  goal `n()`
  <chr>               <dbl> <int>
1 Half Volley             0   257
2 Half Volley             1    19
3 Lob                     0     3
4 Lob                     1     1
5 Normal                  0  1044
6 Normal                  1   107
7 Volley                  0   102
8 Volley                  1     7
# south africa goal vs sweden - tap in with chest - high probability 
# https://www.foxsports.com/watch/play-688491e38000102
goal_of_interest <- shots_valid_wwc_log %>%
  filter(id == "3930cdb9-7755-4f95-b1b1-1babbedb9238")
# Costa Rica vs Zambia
# From corner 
second_goal_interest <- shots_valid_wwc_log %>%
  filter(id == "db185c6e-74b6-436d-9720-9c4da1517f3e")